# Load required libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(car)
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.2
library(rgl)
## Warning: package 'rgl' was built under R version 4.3.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Import data from Excel file
# GS1M  = 1-month Treasury yield series; DGS10 = 10-year Treasury yield series
data <- read_excel("C:/Users/tzp3890/OneDrive - Western Michigan University/Project 2/GS1M.xlsx")
data2 <- read_excel("C:/Users/tzp3890/OneDrive - Western Michigan University/Project 2/DGS10.xlsx")
# Convert observation_date column to Date format
data$observation_date <- as.Date(data$observation_date)
data2$observation_date <- as.Date(data2$observation_date)
# Extract the yield columns as plain numeric vectors for modelling below
GS1M_values <- data$GS1M
DGS10_values <- data2$DGS10

Calculating ACFs and PACFs

### ACF for GS1M
acf(GS1M_values)

### PACF for GS1M
pacf(GS1M_values)

### ACF for DGS10
acf(DGS10_values)

### PACF for DGS10
pacf(DGS10_values)

First Difference PACF

pacf(diff(GS1M_values))

Looking at their respective ACFs, we see that both series are highly correlated with each of their lags until at least lag 15; however, once the effect of the shorter lags is removed by looking at their PACFs, the deeper lags are insignificant. GS1M shares a slightly stronger relationship with its deeper lags than DGS10. The contrast between the slowly decaying ACF lags and the PACFs could suggest non-stationarity and motivate using the first or second difference in the model, but for now I have decided to use the original values.

Fitting Regressions

# Fit Moving Average (MA) Model
ma_model2 <- Arima(DGS10_values, order=c(0, 0, 1))
ma_model <- Arima(GS1M_values, order=c(0, 0, 1))
# Order(p, d, q): p=0 (AR), d=0 (no differencing), q=1 (MA)
# Fit Autoregressive (AR) Model
ar_model2 <- Arima(DGS10_values, order=c(1, 0, 0))
ar_model <- Arima(GS1M_values, order=c(1, 0, 0))
# Order(p, d, q): p=1 (AR), d=0 (no differencing), q=0 (MA)

Coefficients and Error Score for GS1M - MA

# Summarizing MA Model (GS1M): coefficients, AIC/BIC and training-set errors
summary(ma_model)
## Series: GS1M_values 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##          ma1    mean
##       0.9423  1.4231
## s.e.  0.0185  0.1054
## 
## sigma^2 = 0.8067:  log likelihood = -355.52
## AIC=717.03   AICc=717.12   BIC=727.84
## 
## Training set error measures:
##                        ME      RMSE       MAE  MPE MAPE     MASE      ACF1
## Training set -0.002211166 0.8948397 0.7103834 -Inf  Inf 5.991988 0.8276147

The mean used in our Excel model is close to the intercept shown in this model. Using the coefficient from Q4 was also a good guess: that value of about 0.99 is close to the optimized value of 0.9423. The AIC is extremely high, which suggests the model doesn't capture the data well, so it should be used with caution.

Coefficients and Error Score for GS1M - AR

# Summarizing AR Model (GS1M): coefficients, AIC/BIC and training-set errors
summary(ar_model)
## Series: GS1M_values 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1   mean
##       0.9947  3.279
## s.e.  0.0048  1.819
## 
## sigma^2 = 0.05095:  log likelihood = 17.56
## AIC=-29.12   AICc=-29.03   BIC=-18.31
## 
## Training set error measures:
##                        ME      RMSE       MAE  MPE MAPE     MASE      ACF1
## Training set -0.002941103 0.2248958 0.1202839 -Inf  Inf 1.014578 0.2758153

I used a linear regression in Excel to estimate the coefficients, but as we discussed in class, that approach doesn't optimize the coefficients to minimize the error score, so our values are different. The slope coefficients are fairly similar, but the intercepts are very different.

Coefficients and Error Score for DGS10 - MA

# Summarizing MA Model (DGS10): coefficients, AIC/BIC and training-set errors
summary(ma_model2)
## Series: DGS10_values 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##          ma1    mean
##       0.9651  2.2689
## s.e.  0.0052  0.0362
## 
## sigma^2 = 0.4086:  log likelihood = -1164.15
## AIC=2334.3   AICc=2334.32   BIC=2349.57
## 
## Training set error measures:
##                        ME      RMSE       MAE       MPE    MAPE     MASE
## Training set 0.0001189877 0.6387058 0.5620178 -21.73871 36.4544 12.45806
##                   ACF1
## Training set 0.9387602

Extremely high AIC. This shares the same pattern of results as the moving-average model for GS1M, as the coefficient and intercept are very similar.

Coefficients and Error Score for DGS10 - AR

# Summarizing AR Model (DGS10): coefficients, AIC/BIC and training-set errors
summary(ar_model2)
## Series: DGS10_values 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1    mean
##       0.9988  2.2942
## s.e.  0.0012  1.0316
## 
## sigma^2 = 0.003653:  log likelihood = 1659.79
## AIC=-3313.58   AICc=-3313.56   BIC=-3298.31
## 
## Training set error measures:
##                       ME       RMSE        MAE         MPE     MAPE     MASE
## Training set 0.001611576 0.06039056 0.04518806 -0.07034281 2.457796 1.001669
##                    ACF1
## Training set 0.00994548

Forecasts

All MA(1) models can only produce an informative forecast one step into the future (h = 1) since they only use one lagged error term.

Forecasts for GS1M - MA Model

# Forecasting for GS1M - MA Model
ma_forecast <- forecast(ma_model, h = 1)  # h = 1: an MA(1) forecast is only informative one step ahead

# Summarizing Model
summary(ma_forecast)
## 
## Forecast method: ARIMA(0,0,1) with non-zero mean
## 
## Model Information:
## Series: GS1M_values 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##          ma1    mean
##       0.9423  1.4231
## s.e.  0.0185  0.1054
## 
## sigma^2 = 0.8067:  log likelihood = -355.52
## AIC=717.03   AICc=717.12   BIC=727.84
## 
## Error measures:
##                        ME      RMSE       MAE  MPE MAPE     MASE      ACF1
## Training set -0.002211166 0.8948397 0.7103834 -Inf  Inf 5.991988 0.8276147
## 
## Forecasts:
##     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 272        4.04228 2.891241 5.193318 2.281918 5.802641

Forecasts for GS1M - AR Model

# Forecasting for GS1M - AR Model
ar_forecast <- forecast(ar_model, h = 10)  # Forecast the next 10 steps

# Summarizing Model
summary(ar_forecast)
## 
## Forecast method: ARIMA(1,0,0) with non-zero mean
## 
## Model Information:
## Series: GS1M_values 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1   mean
##       0.9947  3.279
## s.e.  0.0048  1.819
## 
## sigma^2 = 0.05095:  log likelihood = 17.56
## AIC=-29.12   AICc=-29.03   BIC=-18.31
## 
## Error measures:
##                        ME      RMSE       MAE  MPE MAPE     MASE      ACF1
## Training set -0.002941103 0.2248958 0.1202839 -Inf  Inf 1.014578 0.2758153
## 
## Forecasts:
##     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 272       5.527968 5.238683 5.817253 5.085545 5.970391
## 273       5.516000 5.107977 5.924024 4.891982 6.140019
## 274       5.504096 5.005697 6.002495 4.741860 6.266332
## 275       5.492255 4.918277 6.066234 4.614431 6.370079
## 276       5.480477 4.840446 6.120509 4.501634 6.459321
## 277       5.468762 4.769493 6.168031 4.399323 6.538202
## 278       5.457110 4.703802 6.210417 4.305025 6.609195
## 279       5.445519 4.642316 6.248722 4.217126 6.673912
## 280       5.433990 4.584300 6.283680 4.134501 6.733478
## 281       5.422522 4.529216 6.315828 4.056329 6.788715

Forecast for DGS10 - MA Model

# Forecasting for DGS10 - MA Model
ma_forecast2 <- forecast(ma_model2, h = 1)  # h = 1: an MA(1) forecast is only informative one step ahead

# Summarizing Model
summary(ma_forecast2)
## 
## Forecast method: ARIMA(0,0,1) with non-zero mean
## 
## Model Information:
## Series: DGS10_values 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##          ma1    mean
##       0.9651  2.2689
## s.e.  0.0052  0.0362
## 
## sigma^2 = 0.4086:  log likelihood = -1164.15
## AIC=2334.3   AICc=2334.32   BIC=2349.57
## 
## Error measures:
##                        ME      RMSE       MAE       MPE    MAPE     MASE
## Training set 0.0001189877 0.6387058 0.5620178 -21.73871 36.4544 12.45806
##                   ACF1
## Training set 0.9387602
## 
## Forecasts:
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1199       3.093965 2.274746 3.913183 1.841078 4.346851

Forecast for DGS10 - AR Model

# Forecasting for DGS10 - AR Model
ar_forecast2 <- forecast(ar_model2, h = 10)  # Forecast the next 10 steps

# Summarizing Model
summary(ar_forecast2)
## 
## Forecast method: ARIMA(1,0,0) with non-zero mean
## 
## Model Information:
## Series: DGS10_values 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1    mean
##       0.9988  2.2942
## s.e.  0.0012  1.0316
## 
## sigma^2 = 0.003653:  log likelihood = 1659.79
## AIC=-3313.58   AICc=-3313.56   BIC=-3298.31
## 
## Error measures:
##                       ME       RMSE        MAE         MPE     MAPE     MASE
## Training set 0.001611576 0.06039056 0.04518806 -0.07034281 2.457796 1.001669
##                    ACF1
## Training set 0.00994548
## 
## Forecasts:
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1199       4.067866 3.990407 4.145324 3.949403 4.186328
## 1200       4.065734 3.956257 4.175210 3.898303 4.233164
## 1201       4.063604 3.929604 4.197605 3.858668 4.268541
## 1202       4.061478 3.906840 4.216115 3.824980 4.297976
## 1203       4.059354 3.886567 4.232140 3.795100 4.323608
## 1204       4.057232 3.868067 4.246396 3.767930 4.346534
## 1205       4.055113 3.850914 4.259311 3.742818 4.367407
## 1206       4.052996 3.834830 4.271163 3.719339 4.386653
## 1207       4.050882 3.819620 4.282144 3.697198 4.404567
## 1208       4.048771 3.805145 4.292396 3.676178 4.421364

None of the models seem to forecast very well, as they all converge to the mean very quickly.

Question 8

Considering the models programmed in Q7, I would use the AR model for GS1M since it has a lower Root Mean Squared Error (RMSE). The MA model has a better ME score, but RMSE weighs larger errors more heavily, which I'd like to avoid — hence my decision to use RMSE. For DGS10, I would also use an AR model for the same reason: a lower RMSE score.

Considering all models, I have a preference for the EMA(1) model for DGS10 because I find the mechanics behind it easy to understand, and the combination of a good accuracy score and an adjustable smoothing factor allows for more insight. I value the smoothing for DGS10 over GS1M because DGS10 increases more consistently, whereas the erratic peaks and inconsistent plateaus of GS1M don't work well when decreasing the smoothing factor.

I wasn't sure about GS1M, so I decided to try to make my AR model more accurate by changing the parameters. I switched to using the first difference to help make the series more stationary. I also used up to 15 lags, inferred from the ACF plots. I tested up to 20 lags as well, but the error score wasn't significantly impacted. This model is the most accurate of all the models across the error scores and also has a low AIC, capturing most of the data. Changing the number of lags and the order of differencing also had a positive impact on the forecast, which looks a lot more reliable.

# Plot raw GS1M values (levels) to inspect trend / non-stationarity
plot(data$GS1M)

# Plot first differences - differencing removes the level, so the
# series should look closer to stationary
plot(diff(data$GS1M))

Series looks more stationary.

Box.test(data$GS1M, lag = 15, type = "Ljung")
## 
##  Box-Ljung test
## 
## data:  data$GS1M
## X-squared = 2188.5, df = 15, p-value < 2.2e-16

The lags exhibit serial correlation

# Fit Autoregressive (AR) Model on the differenced series
fin_mod <- Arima(GS1M_values, order=c(15, 1, 0))
# Order(p, d, q): p=15 (AR lags, from the ACF), d=1 (first differencing), q=0 (MA)

summary(fin_mod)
## Series: GS1M_values 
## ARIMA(15,1,0) 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4      ar5     ar6     ar7      ar8     ar9
##       0.1860  0.0611  0.2019  0.1371  -0.1744  0.1911  0.1724  -0.0343  0.0972
## s.e.  0.0607  0.0631  0.0627  0.0643   0.0650  0.0651  0.0660   0.0679  0.0743
##          ar10    ar11   ar12     ar13    ar14     ar15
##       -0.1258  0.0322  0.126  -0.1692  0.0200  -0.0579
## s.e.   0.0755  0.0750  0.075   0.0737  0.0743   0.0715
## 
## sigma^2 = 0.03943:  log likelihood = 60.29
## AIC=-88.58   AICc=-86.43   BIC=-31
## 
## Training set error measures:
##                       ME      RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set 0.002635001 0.1926092 0.109349 -Inf  Inf 0.9223443 -0.000969829
# Forecasting for the final AR(15) model on first differences
fin_forecast <- forecast(fin_mod, h = 10)  # Forecast the next 10 steps

# Summarizing Model
summary(fin_forecast)
## 
## Forecast method: ARIMA(15,1,0)
## 
## Model Information:
## Series: GS1M_values 
## ARIMA(15,1,0) 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4      ar5     ar6     ar7      ar8     ar9
##       0.1860  0.0611  0.2019  0.1371  -0.1744  0.1911  0.1724  -0.0343  0.0972
## s.e.  0.0607  0.0631  0.0627  0.0643   0.0650  0.0651  0.0660   0.0679  0.0743
##          ar10    ar11   ar12     ar13    ar14     ar15
##       -0.1258  0.0322  0.126  -0.1692  0.0200  -0.0579
## s.e.   0.0755  0.0750  0.075   0.0737  0.0743   0.0715
## 
## sigma^2 = 0.03943:  log likelihood = 60.29
## AIC=-88.58   AICc=-86.43   BIC=-31
## 
## Error measures:
##                       ME      RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set 0.002635001 0.1926092 0.109349 -Inf  Inf 0.9223443 -0.000969829
## 
## Forecasts:
##     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 272       5.653839 5.399374 5.908304 5.264669 6.043009
## 273       5.448865 5.054109 5.843620 4.845138 6.052591
## 274       5.475821 4.963759 5.987884 4.692690 6.258953
## 275       5.681008 5.040392 6.321625 4.701270 6.660746
## 276       5.417362 4.639166 6.195559 4.227214 6.607511
## 277       5.469386 4.583646 6.355127 4.114763 6.824010
## 278       5.460479 4.453713 6.467246 3.920763 7.000196
## 279       5.413923 4.263843 6.564003 3.655027 7.172819
## 280       5.305388 4.021094 6.589682 3.341230 7.269546
## 281       5.375244 3.950643 6.799844 3.196505 7.553982
# Arrange the two forecast plots side by side.
# NOTE: plotly masks graphics::layout (see the library() load messages),
# so a bare layout() call would dispatch to plotly::layout() and never set
# up the plotting panels - qualify the call with graphics:: explicitly.
graphics::layout(matrix(1:2, nrow = 1))
plot(fin_forecast)
plot(fin_forecast, main = "Zoomed In" ,xlim = c(250, 300))

Exploration

# Grid-search ARIMA(p, 1, q) orders and record each model's AIC.

# Define the range of p and q values to try
p_values <- 1:10
q_values <- 1:10

# All (p, q) combinations; q varies fastest so the row order matches a
# nested "for p { for q }" loop
param_grid <- expand.grid(q = q_values, p = p_values)

# Fit one model per combination and collect the AICs. mapply() avoids the
# rbind()-inside-a-loop anti-pattern, which copies the whole data frame on
# every append (O(n^2)).
aic_scores <- mapply(
  function(p, q) AIC(Arima(GS1M_values, order = c(p, 1, q))),
  param_grid$p,
  param_grid$q
)

# Assemble the results in the same column order as before: p, q, AIC
results_df <- data.frame(p = param_grid$p, q = param_grid$q, AIC = aic_scores)

# Print the first few rows of the results
print(head(results_df))
##   p q       AIC
## 1 1 1 -78.45630
## 2 1 2 -76.78491
## 3 1 3 -80.09598
## 4 1 4 -74.56527
## 5 1 5 -73.61364
## 6 1 6 -93.18428
# 3D plot of p and q parameters of ARIMA(p,1,q) AIC scores.
# Use <- (not =) for assignment, and do not bind the result to the name
# "plot": that would shadow graphics::plot() for the rest of the session.
aic_surface <- scatter3d(x = results_df$p, y = results_df$AIC, z = results_df$q)
## Loading required namespace: mgcv
## Loading required namespace: MASS

The script above needs to be run in R to see the 3D plot.

The 3D scatter plot shows that p = 5, q = 4 yields the ARIMA(p,1,q) with the lowest AIC and is therefore the best model to use with first-order differencing. We can also see a relationship in which the deeper lags appear to negatively impact the AIC score. A similar interactive graph is shown below, though it is somewhat harder to interpret.

plot_ly(data = results_df, x = ~p, y = ~AIC, z = ~q, type = "scatter3d", mode = "markers")